loan <- read.csv("loan.csv", stringsAsFactors = FALSE)
head(loan, 3)
rough ideas :
predict the grade or sub_grade assigned by LC(classification problem)
predict the loan_status (classification problem)
predict int_rate(interest rate loan) (regression problem)
these problem are interested by borrowers
rough ideas:
clean data
explore data
variable identification
missing value treatment
variable analysis
feature engineering
build a regression model
since this data is already table like, we can skip this step
variable identification
type of variable: predictors(features), response(target)
data type: character, numeric, object
variable category: continuous, categorical
# int_rate is the response
response = loan$int_rate
# all others are features, where `which` find the index of column "int_rate"
features = loan[, -which(names(loan) == "int_rate")]
numerical
continous : e.g. int_rate
discrete : e.g. inq_fi(Number of personal finance inquiries)
categorical
ordinal : e.g. grade
nominal : e.g. zip code
ATTENTION: data type may not like the type it loaded into
# load library
library(ggplot2)
# apply function class to all load columns
types = sapply(loan, class)
# convert types to data.frame
# `table` calculate the frequency
df.types = data.frame(table(types), row.names = NULL)
# visualization
options(repr.plot.width=4, repr.plot.height=2)
ggplot(data = df.types,
aes(x = types, y = Freq/sum(Freq))) +
geom_bar(stat = "identity", width = 0.2)
# Check variable one by one
# addr_state(categorical: nomial)
# emp_length(numeric: discrete ) change from string to integer
substr(loan$emp_length[1], 1, 2) # not numerical
# deinfe a function extract numbers from string
str2num <- function(str){as.numeric(gsub("([^0-9])", "", str))}
loan$emp_length = sapply(loan$emp_length, str2num)
# emp_title(categorical: nominal)
# home_ownership (categorical: nominal)
# member_id(categorical: nominal)
# zip_code(categorical: nominal)
# annual_inc(numerical: continuous)
# annual_inc_joint(numerical: continuous)
# is_inc_v(binary) : missing feature
# verified_status_joint(categorical: nominal)
# unique(loan$verification_status_joint)
# dti(numerical: continous)
# dti_joint(numerical: continous)
# earliest_cr_line(categorica:nominal) could be splitted to year and mon
library(zoo)
loan$earliest_cr_line_date = as.Date(as.yearmon(loan$earliest_cr_line, "%b-%Y"))
loan$earliest_cr_line_year = format(loan$earliest_cr_line_date, "%Y")
loan$earliest_cr_line_mon = format(loan$earliest_cr_line_date, "%m")
# fico_range_high: missing
# fico_range_low: missing
# last_fico_range_high: missing
# last_fico_range_low: missing
# inq_fi(numerical : discrete)
# inq_last_12m(numerical : discrete)
# inq_last_6mths(numerical : discrete)
# last_credit_pull_d(categorical) could be splitted to year and mon
loan$last_credit_pull_d_date = as.Date(as.yearmon(loan$last_credit_pull_d, "%b-%Y"))
loan$last_credit_pull_d_year = format(loan$last_credit_pull_d_date, "%Y")
loan$last_credit_pull_d_mon = format(loan$last_credit_pull_d_date, "%m")
# total_acc(numerical : discrete)
# tot_cur_bal(numerical : continous)
# all_util(numerical : continous)
# open_acc(numerical : discrete)
# total_acc(numerical : discrete)
# tot_cur_bal(numerical : continous)
# all_util(numerical : continous)
# open_acc(numerical : discrete)
# open_acc_6m(numerical : discrete)
# total_cu_tl(numerical : discrete)
# acc_noew_delinq(numerical : discrete)
# delinq_2yrs(numerical : discrete)
# mths_since_last_delinq(numerical : discrete)
# collections_12_mths_ex_med(numerical : discrete)
# tot_coll_amt(numerical : discrete)
# pub_rec(numerical : discrete)
# mths_since_last_major_derog(numerical : discrete)
# mths_since_last_record(numerical : discrete)
# il_util(numerical : continuous)
# mths_since_rcnt_il(numerical : discrete)
# open_il_12m(numerical : discrete)
# open_il_24m(numerical : discrete)
# open_il_6m(numerical : discrete)
# total_bal_il(numerical : continuous)
# max_bal_bc(numerical : continuous)
# open_rv_12m(numerical : discrete)
# open_rv_24m(numerical : discrete)
# revol_bal(numerical : continuous)
# revol_util(numerical : continuous)
# total_rev_hi_lim(numerical : continuous)
# policy_code(binary) : no use, unique value
# url(string): not useful
# application_type(categorical : nominal)
# desc(string) might be useful, but need nlp
# id : not useful
# purpose(categorical : nominal)
# term(categorical : nominal)
# title(string) might be useful, but need nlp
# issue_d(categorical) could be splitted to year and mon
loan$issue_d_date = as.Date(as.yearmon(loan$issue_d, "%b-%Y"))
loan$issue_d_year = format(loan$issue_d_date, "%Y")
loan$issue_d_mon = format(loan$issue_d_date, "%m")
# inital_list_status(binary)
# loan_amnt(numerical : continuous)
# installment(numerical : continuous)
# funded_amnt(numerical : continuous)
# funded_amnt_inv(numerical : continuous)
# last_pymnt_amnt(numerical : continuous)
# last_pymnt_d(categorical)
loan$last_pymnt_d_date = as.Date(as.yearmon(loan$last_pymnt_d, "%b-%Y"))
loan$last_pymnt_d_year = format(loan$last_pymnt_d_date, "%Y")
loan$last_pymnt_d_mon = format(loan$last_pymnt_d_date, "%m")
# next_pymnt_d(categorical)
loan$next_pymnt_d_date = as.Date(as.yearmon(loan$next_pymnt_d, "%b-%Y"))
loan$next_pymnt_d_year = format(loan$next_pymnt_d_date, "%Y")
loan$next_pymnt_d_mon = format(loan$next_pymnt_d_date, "%m")
# pymnt_plan(binary)
# recoveries(numerical : continuous)
# total_pymnt(numerical : continuous)
# total_pymnt_inv(numerical : continuous)
# total_rec_int(numerical : continuous)
# total_rec_late_fee(numerical : continuous)
# total_rec_prncp(numerical : continuous)
# collection_recovery_fee(numerical : continuous)
# out_prncp(numerical : continuous)
# out_prncp_inv(numerical : continuous)
# Potential response variables -------------------------------
# grade(categorical : ordinal)
# plot(table(loan$grade))
# sub_grade(categorical : ordinal)
# plot(table(loan$sub_grade))
# int_rate(numerical : continuous)
# hist(loan$int_rate)
# loan_status(categorical : nominal)
# unique(loan$loan_status)
univariate analysis
▪ continuous
▪ categorical
bi-variate analysis
▪ continuous & continuous
▪ categorical & categorical
▪ continuous & categorical
• common statistics : mean, median, skewness, ...
• data visualization : barplot, histogram, density plot, ...
• hpyothesis test : Chi2-test, anova, t-test, ...
Since we focus on int_rate, which is response, basicly, we can think, what we are doing is try to estimate the density of P(response|features)P(response|features) int_rate : Interest Rate on the loan since this is continuous variable, we can look at its basic stats, and histogram, density plot, boxplot
# stats
"range and quantile"
summary(loan$int_rate)
"sd"
sd(loan$int_rate)
analysis: positive skew, but not strong, which means we could could use nomarl distribution to model this curve, but we need more analysis
plothistdensity <- function(data, nbins){
options(repr.plot.width=8, repr.plot.height=2)
p <- ggplot(data, aes(x=int_rate, y = ..density..))
p <- p + geom_histogram(aes(y=..count../sum(..count..)),
bins = nbins,
color = "grey",
fill="orange")
p <- p + geom_density()
p + theme_dark() +
ylab("Normalized_Count") +
ggtitle(paste(paste("int_rate : ", nbins), " bins histogram with density" ))
}
plothistdensity(loan, 10)
plothistdensity(loan, 20)
plothistdensity(loan, 50)
plothistdensity(loan, 100)
analysis: usually histogram could roughly estimate pdf of a random variable, however, to get more stable and generalize result we could use KDE to estimate density of a random variable. From the density plot, we observe many peaks, which indicates this a multimodal distribution. This means there are different patterns inside the distribution, usually, it is a mixture distribution. This pattern may cause by groups(usage, years, repuations, states, ...)
int_rate : Interest Rate on the loan
grade : LC assigned loan grade
options(repr.plot.width=4, repr.plot.height=2)
ggplot(data.frame(table(loan$grade)), aes(x=Var1, y=Freq)) + geom_bar(stat = "identity")
# group histogram
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=int_rate, fill=grade)) +
geom_histogram(bins = 100, color = "white") + theme_light() +
geom_freqpoly(bins = 30, color = "grey")
# group density
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=int_rate, y= ..density.., fill=grade)) +
theme_light() +
geom_density(bw = 0.3, alpha= 0.6)
Even we group int_rate by grade, it still shows multimodality inside each group, we can dig into it latter, but, clearly it shows grade was directly related to int_rate
# boxplot 1
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=factor(grade), y=int_rate)) +
geom_boxplot() + labs(x='grade') +
stat_summary(fun.y="mean", geom="point", shape=23, size=3, fill="white")
box plot shows the skewness, quantile, outliers of each group, but it cannot show the amount of data in each group
table(loan$grade)
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=factor(grade), y=int_rate)) +
geom_jitter(alpha=0.3, color='lightgrey') +
geom_boxplot(color='steelblue') +
labs(x='grade')
we can also use a violin plot to compare multiple distribution in different group
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=factor(grade), y=int_rate)) +
geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) #+ geom_boxplot(width=.1, fill="black", outlier.colour=NA) +
#stat_summary(fun.y=median, geom="point", fill="white", shape=21, size=2.5)
int_rate : Interest Rate on the loan
subgrade : LC assigned loan subgrade
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x=int_rate, y=..density.., fill = sub_grade)) +
geom_density(bw = 0.1, alpha= 0.6) + facet_grid(grade ~ .)
int_rate : Interest Rate on the loan
loan_status : Current status of the loan
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x=factor(loan_status), y=int_rate)) +
geom_jitter(alpha=0.3, color='lightgrey') +
geom_boxplot(color='steelblue') +
labs(x='loan_status') +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=0.8),legend.position = "none")
options(repr.plot.width=8, repr.plot.height=6)
ggplot(loan, aes(x=int_rate, y= ..density.., fill=loan_status)) +
theme_light() +
facet_grid(loan_status ~ .) +
geom_density(alpha= 0.6)
int_rate also influence by loan_status, and within each loan_status group, it appears more like a normal
library(choroplethr)
library(choroplethrMaps)
state.ave.int_rate = as.list(by(loan$int_rate, loan$addr_state, mean))
state.names = names(state.ave.int_rate)
names(state.ave.int_rate) = NULL
state.int_rate.df = data.frame(region = state.names, value = unlist(state.ave.int_rate))
full.name = c()
for(i in 1:51)
{
if(i==8){
full.name <- append(full.name, "district of columbia")
}else{
full.name <- append(full.name, tolower(state.name[state.int_rate.df[i,1] == state.abb]))
}
}
state.int_rate.df$region = full.name
options(repr.plot.width=16, repr.plot.height=8)
state_choropleth(state.int_rate.df)
# anova analysis on different groups
aov.res = aov(int_rate ~ factor(addr_state), na.action = na.omit, data = loan)
summary(aov.res)
As the p-value is less than the significance level 0.001, we can conclude that there are significant differences between the groups.
possible feature enigneering:
group median of int_rate on each state
set level of int_rate on each state
emp_length with int_rate : Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years.
unique(loan$emp_length)
loan[which(loan$emp_length == "n/a"), "emp_length"] <- NA
sum(is.na(loan$emp_length))/dim(loan)[1]
# group density
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=int_rate, y= ..density.., fill=emp_length)) +
theme_light() +
facet_grid(emp_length ~ .) +
geom_density(alpha= 0.6)
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=factor(emp_length), y=int_rate)) +
geom_boxplot() + labs(x='emp_length') +
stat_summary(fun.y="mean", geom="point", shape=23, size=3, fill="white")
# anova analysis on different groups
aov.res = aov(int_rate ~ factor(emp_length), na.action = na.omit, data = loan, )
summary(aov.res)
# Tukey test to find significance between groups
tukey.res = TukeyHSD(aov.res, conf.level = 0.95)
tukey.res
# feature engineering : split emp_length to < 5 ; >= 5
loan$emp_length_sim = NA
loan[which(is.na(loan[,'emp_length'])),'emp_length']=median(loan[,'emp_length'],na.rm=T)
loan[which(loan$emp_length %in% c('10', '8','9','5','6','7')), "emp_length_sim"] = ">= 5"
loan[which(loan$emp_length %in% c('0', '1','2','3','4')), "emp_length_sim"] = "< 5"
table(loan$emp_length_sim)
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=factor(emp_length_sim), y=int_rate)) +
geom_boxplot() + labs(x='emp_length_sim') +
stat_summary(fun.y="mean", geom="point", shape=23, size=3, fill="white")
# t-test on group means
t.test(int_rate ~ emp_length_sim, data = loan, na.action = na.omit) # significant
nominal
# display top ten emp_title
loan$emp_title = tolower(loan$emp_title) # convert all string to lowercase
# convert " " to na
loan$emp_title[which(loan$emp_title == "")] = NA
sort(table(loan$emp_title), decreasing = T)[1:100] # possible text processing, na exists as a level
length(unique(loan$emp_title))
# percentage of missing
sum(is.na(loan$emp_title))/dim(loan)[1]
# bar plot
emp_title.pop = sort(table(loan$emp_title)[which(table(loan$emp_title)>500)], decreasing = T)
emp_title.pop = as.data.frame(emp_title.pop)
options(repr.plot.width=30, repr.plot.height=10)
ggplot(data = emp_title.pop, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity") +
# scale_y_log10() +
scale_x_discrete(name ="emp_titles",
limits=emp_title.pop$Var1)+
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
possible feature engineering : binning
# generate feature emp_title
#emp_title.ave_rate = as.list(by(loan$int_rate, loan$emp_title, median))
#emp_title.ave_rate = unlist(emp_title.ave_rate)
#loan$gen_emp_title_ave_rate = unlist(apply(loan, 1, function(x){return(emp_title.ave_rate[x["emp_title"]])}))
#plot(table(loan$gen_empt_title_ave_rate))
nominal
unique(loan$home_ownership)
sum(loan$home_ownership == "NONE") / dim(loan)[1]
#boxplot 1
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=factor(home_ownership), y=int_rate)) +
#geom_jitter(alpha=0.3, color='lightgrey') +
geom_boxplot() + labs(x='home_ownership') +
stat_summary(fun.y="mean", geom="point", shape=23, size=3, fill="white")
we can observe rough trend, that mortage < own < rent
# binning "ANY", "NONE", "OTHER"
loan$gen_home_ownership = loan$home_ownership
loan$gen_home_ownership[loan$home_ownership %in% c("ANY", "NONE", "OTHER")] = "others"
table(loan$gen_home_ownership)
sum(is.na(loan$zip_code))/dim(loan)[1]
head(table(loan$zip_code, loan$addr_state))
possible : feature engineering combine zip_code and addr_state to generate more accurate address
# possible : feature engineering combine zip_code and addr_state to generate more accurate address
loan$gen_add_state_zip = paste(loan$addr_state, as.character(loan$zip_code))
add_state_zip.ave_rate = as.list(by(loan$int_rate, loan$gen_add_state_zip, median))
add_state_zip.ave_rate = unlist(add_state_zip.ave_rate)
loan$gen_add_state_zip_ave_rate = unlist(apply(loan, 1, function(x){return(add_state_zip.ave_rate[x["gen_add_state_zip"]])}))
plot(table(loan$gen_add_state_zip_ave_rate))
length(which(is.na(loan$annual_inc) | (loan$annual_inc == 0)))/dim(loan)[1]
loan$annual_inc[which(is.na(loan$annual_inc) | (loan$annual_inc == 0))] = mean(loan$annual_inc, na.rm=T)
#loan$annual_inc[which(is.na(loan$annual_inc) == 0)] = mean(loan$annual_inc, na.rm=T)
range(loan$annual_inc)
# linear relationship
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x = annual_inc, y=int_rate)) + geom_hex() + stat_smooth(method=lm)
ggplot(loan, aes(x = annual_inc, y=int_rate)) +
stat_density_2d(aes(fill = ..level..), geom = "polygon")
# linear relationship in log scale : possible feature engineering
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x = log(annual_inc), y=int_rate)) + geom_hex() + stat_smooth(method=lm)
cor(loan$annual_inc, loan$int_rate)
# possible feature
cor(log(loan$annual_inc), loan$int_rate)
# new feature
loan$gen_log_annual_inc = log(loan$annual_inc)
numerical: continuous
# drop this feature
sum(is.na(loan$annual_inc_joint))/dim(loan)[1]
categorical: nominal
sum(is.na(loan$verification_status))/dim(loan)[1]
unique(loan$verification_status)
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=factor(verification_status), y=int_rate)) +
geom_boxplot() + labs(x='verification_status') +
stat_summary(fun.y="mean", geom="point", shape=23, size=3, fill="white")
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=int_rate, y= ..density.., fill=verification_status)) +
theme_light() +
geom_density(bw = 0.3, alpha= 0.6)
aov.res = aov(int_rate ~ factor(verification_status), na.action = na.omit, data = loan, )
summary(aov.res)
As the p-value is less than the significance level 0.001, we can conclude that there are significant differences between the different verification status.
dti : A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.
quantile(loan$dti)
# weak linear relationship
cor(loan$dti, loan$int_rate)
numerical: continuous drop this feature
sum(is.na(loan$dti_joint))/dim(loan)[1]
inq_fi : Number of personal finance inquiries
# drop this feature
sum(is.na(loan$inq_fi))/dim(loan)[1]
inq_last_12m : Number of credit inquiries in past 12 months
# drop this feature
sum(is.na(loan$inq_last_12m))/dim(loan)[1]
inq_last_6mths : The number of inquiries in past 6 months (excluding auto and mortgage inquiries)
sum(is.na(loan$inq_last_6mths))/dim(loan)[1]
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=factor(inq_last_6mths), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$inq_last_6mths)), aes(x=Var1, y=Freq)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=0, hjust=1, vjust=.5),
legend.position = "none")
# binning: > 4 is 5
loan$gen_inq_last_6mths = loan$inq_last_6mths
loan$gen_inq_last_6mths[loan$inq_last_6mths > 4] = 5
last_credit_pull_d : The most recent month LC pulled credit for this loan
sum(is.na(loan$last_credit_pull_d_date))/dim(loan)[1]
library("zoo")
loan$last_credit_pull_d_date = as.Date(as.yearmon(loan$last_credit_pull_d, "%b-%Y"))
options(repr.plot.width=20, repr.plot.height=8)
ggplot(loan, aes(x=factor(last_credit_pull_d_date), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='last_credit_pull_d_date') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$last_credit_pull_d_date)), aes(x=Var1, y=log(Freq))) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
total_acc : The total number of credit lines currently in the borrower's credit file
sum(is.na(loan$total_acc))/dim(loan)[1]
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=factor(total_acc), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x="The total number of credit lines currently in the borrower's credit file") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$total_acc)), aes(x=Var1, y=Freq)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
could be modeled as log normal
tot_cur_bal : Total current balance of all accounts
sum(is.na(loan$tot_cur_bal))/dim(loan)[1]
# linear relationship
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x = tot_cur_bal, y=int_rate)) + geom_hex(bins=100) + stat_smooth(method=lm)
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x = log(tot_cur_bal), y=int_rate)) + geom_hex(bins=100) + stat_smooth(method=lm)
# new feature
loan$gen_log_tot_cur_bal = log(loan$tot_cur_bal)
all_util : Balance to credit limit on all trades
# drop this feature
sum(is.na(loan$all_util))/dim(loan)[1]
open_acc : The number of open credit lines in the borrower's credit file.
sum(is.na(loan$open_acc))/dim(loan)[1]
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=factor(open_acc), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$open_acc)), aes(x=Var1, y=Freq)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
open_acc_6m : Number of open trades in last 6 months
# drop feature
sum(is.na(loan$open_acc_6m))/dim(loan)[1]
total_cu_tl : Number of finance trades
# drop feature
sum(is.na(loan$total_cu_tl))/dim(loan)[1]
acc_now_deling : The number of accounts on which the borrower is now delinquent.
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=factor(acc_now_delinq), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
aov.res = aov(int_rate ~ factor(acc_now_delinq), na.action = na.omit, data = loan, )
summary(aov.res)
As the p-value is less than the significance level 0.001, we can conclude that there are significant differences between the different verification status.
delinq_2yrs : The number of 30+ days past-due incidences of delinquency in the borrower's credit file for the past 2 years
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=factor(delinq_2yrs), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$delinq_2yrs)), aes(x=Var1, y=log(Freq))) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x = inq_last_6mths, y=delinq_2yrs)) + geom_hex(bins=50) + stat_smooth(method=lm)
mths_since_last_delinq : The number of months since the borrower's last delinquency.
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=factor(mths_since_last_delinq), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$mths_since_last_delinq)), aes(x=Var1, y=Freq)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
collections_12_mths_ex_med : Number of collections in 12 months excluding medical collections
table(loan$collections_12_mths_ex_med)
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=factor(collections_12_mths_ex_med), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$collections_12_mths_ex_med)), aes(x=Var1, y=log(Freq))) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
tot_coll_amt : Total collection amounts ever owed
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=tot_coll_amt, y=int_rate)) + geom_hex(bins=50)
# generate feature log tot_coll_amt
loan$gen_log_tot_coll_amt = log(loan$tot_coll_amt)
options(repr.plot.width=8, repr.plot.height=4)
tot_coll_amt.data = as.data.frame(table(loan$tot_coll_amt))
tot_coll_amt.data = as.data.frame(sapply(tot_coll_amt.data[, 1:2], as.numeric))
ggplot(tot_coll_amt.data, aes(x=Var1, y=log(Freq))) +
geom_line()
pub_rec : Number of derogatory public records
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=factor(pub_rec), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$pub_rec)), aes(x=Var1, y=log(Freq))) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
mths_since_last_major_derog : Months since most recent 90-day or worse rating
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=factor(mths_since_last_major_derog), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
options(repr.plot.width=48, repr.plot.height=8)
ggplot(as.data.frame(table(loan$mths_since_last_major_derog)), aes(x=Var1, y=Freq)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
mths_since_last_record : The number of months since the last public record.
# drop
sum(is.na(loan$mths_since_last_record))/dim(loan)[1]
il_util : Ratio of total current balance to high credit/credit limit on all install acct
#drop
sum(is.na(loan$il_util))/dim(loan)[1]
mths_since_rcnt_il : Months since most recent installment accounts opened
# drop
sum(is.na(loan$mths_since_rcnt_il))/dim(loan)[1]
open_il_24m : Number of installment accounts opened in past 24 months
# drop
sum(is.na(loan$open_il_12m))/dim(loan)[1]
sum(is.na(loan$open_il_24m))/dim(loan)[1]
sum(is.na(loan$open_il_6m))/dim(loan)[1]
total_bal_il : Total current balance of all installment accounts
max_bal_bc : Maximum current balance owed on all revolving accounts
open_rv_12m : Number of revolving trades opened in past 12 months
open_rv_24m : Number of revolving trades opened in past 24 months
# drop
sum(is.na(loan$total_bal_il))/dim(loan)[1]
sum(is.na(loan$max_bal_bc))/dim(loan)[1]
sum(is.na(loan$open_rv_12m))/dim(loan)[1]
sum(is.na(loan$open_rv_24m))/dim(loan)[1]
revol_bal : Total credit revolving balance
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=revol_bal, y=int_rate)) + geom_hex(bins=50)
options(repr.plot.width=8, repr.plot.height=4)
revol_bal.data = as.data.frame(table(loan$revol_bal))
revol_bal.data = as.data.frame(sapply(revol_bal.data[, 1:2], as.numeric))
ggplot(revol_bal.data, aes(x=Var1, y=log(Freq))) +
geom_line()
revol_util : Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=revol_util, y=int_rate)) + geom_hex(bins=50)
revol_util.data = as.data.frame(table(loan$revol_util))
revol_util.data = as.data.frame(sapply(revol_util.data[, 1:2], as.numeric))
ggplot(revol_util.data, aes(x=Var1, y=log(Freq))) +
geom_line() + geom_point()
total_rev_hi_lim : Total revolving high credit/credit limit
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x= total_rev_hi_lim, y=int_rate)) + geom_hex(bins=50)
options(repr.plot.width=8, repr.plot.height=4)
total_rev_hi_lim.data = as.data.frame(table(loan$total_rev_hi_lim))
total_rev_hi_lim.data = as.data.frame(sapply(total_rev_hi_lim.data[, 1:2], as.numeric))
ggplot(total_rev_hi_lim.data, aes(x=Var1, y=log(Freq))) +
geom_point()
# New feature
loan$gen_log_total_rev_hi_lim = log(loan$total_rev_hi_lim)
policy_code : publicly available policy_code=1 only one value, drop
# drop
unique(loan$policy_code)
url : URL for the LC page with listing data.
# same as id, no use
loan$url[1:10]
application_type : Indicates whether the loan is an individual application or a joint application with two co-borrowers
table(loan$application_type)
aov.res = aov(int_rate ~ factor(application_type), na.action = na.omit, data = loan, )
summary(aov.res)
# split data to individual & joint
desc : Loan description provided by the borrower
# need nlp
loan$desc[1:10]
id : A unique LC assigned ID for the loan listing. no use
purpose : A category provided by the borrower for the loan request.
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=factor(purpose), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
aov.res = aov(int_rate ~ factor(purpose), na.action = na.omit, data = loan, )
summary(aov.res)
title : The loan title provided by the borrower
loan$title = tolower(loan$title) # convert all string to lowercase
unique(loan$title)
unique(loan$title)[which(table(loan$title)>500)]
# bar plot
title.pop = sort(table(loan$title)[which(table(loan$title)>500)], decreasing = T)
title.pop = as.data.frame(title.pop)
options(repr.plot.width=30, repr.plot.height=10)
ggplot(data = title.pop, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity") +
# scale_y_log10() +
scale_x_discrete(name ="emp_titles",
limits=title.pop$Var1)+
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
term : The number of payments on the loan. Values are in months and can be either 36 or 60.
unique(loan$term)
t.test(int_rate ~ term, data = loan, na.action = na.omit) # significant
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=factor(term), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='number of payments on the loan') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
issue_d : The month which the loan was funded
loan$issue_d_date = as.Date(as.yearmon(loan$issue_d, "%b-%Y"))
options(repr.plot.width=16, repr.plot.height=4)
ggplot(loan, aes(x=factor(issue_d_date), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='number of payments on the loan') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$issue_d_date)), aes(x=Var1, y=Freq)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
# generate feature issue_d, issue_year, issue_mon
loan$gen_issue_d <- as.Date(as.yearmon(loan$issue_d, "%b-%Y"))
loan$gen_issue_year <- as.character(format(loan$gen_issue_d, "%Y"))
loan$gen_issue_mon <- as.character(format(loan$gen_issue_d, "%m"))
initial_list_status : The initial listing status of the loan. Possible values are – W, F
unique(loan$initial_list_status)
t.test(int_rate ~ initial_list_status, data = loan, na.action = na.omit) # significant
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=factor(initial_list_status), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='number of payments on the loan') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
loan_amnt : The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=loan_amnt, y= ..density..)) +
geom_density()
ggplot(loan, aes(x= log(loan_amnt), y=int_rate)) + geom_hex(bins=50) + stat_smooth(method=lm)
loan$gen_log_loan_amnt = log(loan$loan_amnt)
installment : The monthly payment owed by the borrower if the loan originates.
ggplot(loan, aes(x= installment, y=int_rate)) +
stat_density_2d(aes(fill = ..level..), geom = "polygon") +
stat_smooth(method=lm)
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=installment, y= ..density..)) +
geom_density()
funded_amnt : The total amount committed to that loan at that point in time.
ggplot(loan, aes(x= funded_amnt, y=int_rate)) +
stat_density_2d(aes(fill = ..level..), geom = "polygon") +
stat_smooth(method=lm)
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=funded_amnt, y= ..density..)) +
geom_density()
# only keep one is enough
cor(loan$funded_amnt, loan$funded_amnt_inv)
last_pymnt_amnt : Last total payment amount received.
ggplot(loan, aes(x= last_pymnt_amnt, y=int_rate)) +
stat_density_2d(aes(fill = ..level..), geom = "polygon")
# stat_smooth(method=lm)
last_pymnt_d : Last month payment was received
loan$last_pymnt_d_date = as.Date(as.yearmon(loan$last_pymnt_d, "%b-%Y"))
options(repr.plot.width=20, repr.plot.height=8)
ggplot(loan, aes(x=factor(last_pymnt_d_date), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='emp_length_sim') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
next_pymnt_d : Next scheduled payment date
loan$next_pymnt_d_date = as.Date(as.yearmon(loan$next_pymnt_d, "%b-%Y"))
options(repr.plot.width=20, repr.plot.height=8)
ggplot(loan, aes(x=factor(next_pymnt_d_date), y=int_rate)) +
geom_boxplot(color='steelblue') + labs(x='emp_length_sim') +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none")
pymnt_plan : Indicates if a payment plan has been put in place for the loan
# basic no use
table(loan$pymnt_plan)
recoveries : post charge off gross recovery
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=recoveries, y=int_rate)) + geom_hex(bins=50) #+ stat_smooth(method=lm)
total_pymnt : Payments received to date for total amount funded
ggplot(loan, aes(x= total_pymnt, y=int_rate)) +
stat_density_2d(aes(fill = ..level..), geom = "polygon")
# only keep one is enough
cor(loan$total_pymnt, loan$total_pymnt_inv)
total_rec_int : Interest received to date
ggplot(loan, aes(x= total_rec_int, y=int_rate)) +
stat_density_2d(aes(fill = ..level..), geom = "polygon")
total_rec_late_fee : Late fees received to date
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x=total_rec_late_fee, y=int_rate)) + geom_hex(bins=50) #+ stat_smooth(method=lm)
cor(loan$total_rec_late_fee, loan$int_rate)
total_rec_prncp : Principal received to date
ggplot(loan, aes(x= total_rec_prncp, y=int_rate)) +
stat_density_2d(aes(fill = ..level..), geom = "polygon")
collection_recovery_fee : post charge off collection fee
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x=collection_recovery_fee, y=int_rate)) + geom_hex(bins=50) #+ stat_smooth(method=lm)
out_prncp : Remaining outstanding principal for total amount funded
ggplot(loan, aes(x=out_prncp, y=int_rate)) +
stat_density_2d(aes(fill = ..level..), geom = "polygon")
# keep one is enough
cor(loan$out_prncp, loan$out_prncp_inv)
num_var=c("int_rate","annual_inc","annual_inc_joint","dti",'total_acc','tot_cur_bal','open_acc','open_acc_6m',
'acc_now_delinq','delinq_2yrs','mths_since_last_delinq','collections_12_mths_ex_med','tot_coll_amt',
'pub_rec','mths_since_last_major_derog','mths_since_last_record','il_util','mths_since_rcnt_il',
'open_il_12m','open_il_24m','open_il_6m','total_bal_il','open_rv_12m','revol_bal','revol_util',
'loan_amnt','installment','funded_amnt','funded_amnt_inv','last_pymnt_amnt','recoveries','total_pymnt',
'total_pymnt_inv','total_rec_int','total_rec_late_fee','total_rec_prncp','collection_recovery_fee',
'out_prncp','out_prncp_inv')
#Plot the correlation matrix
library(corrplot)
correlations <- cor(loan[, num_var],use="complete.obs")
#print(correlations)
corrplot(correlations, method = "square", tl.cex = 1, type = 'lower')
• Remove features with too many missing value, or remove all rows with NA if you have a lot of data
• If not missing at random, add new level to represent NA, impute with 0, or generate new feature.
• If missing at random, imputation using summary stats or modeling way.
colnames(loan)
num.NA <- sort(colSums(sapply(loan, is.na)))
dfnum.NA <- data.frame(ind = c(1:length(num.NA)),
percentage = num.NA/nrow(loan),
per80 = num.NA/nrow(loan)>=0.2,
name = names(num.NA),
row.names = NULL) # convert to data.frame
options(repr.plot.width=8, repr.plot.height=4)
ggplot(data = dfnum.NA, aes(x=ind, y=percentage)) +
geom_bar(aes(fill=per80), stat="identity") +
scale_x_discrete(name ="column names",
limits=dfnum.NA$name)+
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none") +
geom_hline(yintercept = 0.2) +
ggtitle("percentage of missing")
loan_cp=loan
# drop missing > 80%
dfnum.NA$name[dfnum.NA$per80]
loan[, as.character(dfnum.NA$name[dfnum.NA$per80])] <- NULL
num.NA <- sort(colSums(sapply(loan, is.na)))
dfnum.NA <- data.frame(ind = c(1:length(num.NA)),
percentage = num.NA/nrow(loan),
per80 = num.NA/nrow(loan)>=0.2,
name = names(num.NA),
row.names = NULL) # convert to data.frame
options(repr.plot.width=8, repr.plot.height=4)
ggplot(data = dfnum.NA, aes(x=ind, y=percentage)) +
geom_bar(aes(fill=per80), stat="identity") +
scale_x_discrete(name ="column names",
limits=dfnum.NA$name)+
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
legend.position = "none") +
geom_hline(yintercept = 0.2) +
ggtitle("percentage of missing")
num.NA <- sort(sapply(loan, function(x) { sum(is.na(x))} ), decreasing = TRUE)
num.NA
for(col.i in names(num.NA)[which(num.NA > 0)]) {
loan[which(is.na(loan[, col.i])), col.i] <- median(loan[, col.i], na.rm = T)}
apply(loan, 2, function(x){length(unique(x))})
# remove all unique and single
loan[, c("id", "member_id", "url", "desc")] <- NULL
# drop loan payment features
loan[, c("installment", "funded_amnt", "funded_amnt_inv",
"last_pymnt_amnt", "last_pymnt_d", "next_pymnt_d", 'gen_add_state_zip',
"pymnt_plan", "recoveries", "total_pymnt", "verification_status", 'verification_status_joint',
"total_pymnt_inv", "total_rec_int", "total_rec_late_fee",
"total_rec_prncp", "collection_recovery_fee", "out_prncp", "out_prncp_inv")] <- NULL
# drop potential response variables
loan[, c("grade", "sub_grade", "loan_status")] <- NULL
colnames(loan)
# drop "zip_code", "addr_state", "earliest_cr_line", "policy_code"
loan[, c("zip_code", "addr_state", "policy_code", "earliest_cr_line")] <- NULL
# save the updated data
saveRDS(loan, file="loan_updated.Rda")
# split data into train and test for model performance
set.seed(1)
loan.complete = loan[complete.cases(loan), ]
train.ind <- sample(1:dim(loan.complete)[1], 0.7 * dim(loan.complete)[1])
train <- loan.complete[train.ind, ]
test <- loan.complete[-train.ind, ]
mod <- lm(int_rate ~ loan_amnt + term + annual_inc + purpose + dti + delinq_2yrs +
inq_last_6mths + open_acc + pub_rec + revol_bal + revol_util + total_acc +
initial_list_status + collections_12_mths_ex_med + application_type + acc_now_delinq +
tot_coll_amt + tot_cur_bal + total_rev_hi_lim + emp_length_sim +
gen_home_ownership + gen_add_state_zip_ave_rate + gen_inq_last_6mths +
issue_d_date + gen_issue_year + gen_issue_mon
,data = train)
summary(mod)
#plot(mod)
# remove extreme outliers
train.sub = train[-which(rownames(train) %in% c(534801, 755747, 301371)), ]
mod2 <- lm(int_rate ~ loan_amnt + term + annual_inc + purpose + dti + delinq_2yrs +
inq_last_6mths + open_acc + pub_rec + revol_bal + revol_util + total_acc +
initial_list_status + collections_12_mths_ex_med + application_type + acc_now_delinq +
tot_coll_amt + tot_cur_bal + total_rev_hi_lim + emp_length_sim +
gen_home_ownership + gen_add_state_zip_ave_rate + gen_inq_last_6mths +
issue_d_date + gen_issue_year + gen_issue_mon
,data = train.sub)
summary(mod2)
train.sub2 = train.sub[-which(rownames(train.sub) %in% c(52576)), ]
mod3 <- lm(int_rate ~ loan_amnt + term + annual_inc + purpose + dti + delinq_2yrs + I(loan_amnt^2) +
inq_last_6mths + open_acc + pub_rec + revol_bal + revol_util + total_acc + sqrt(annual_inc) +
initial_list_status + collections_12_mths_ex_med + application_type + acc_now_delinq +
tot_coll_amt + tot_cur_bal + total_rev_hi_lim + emp_length_sim +
gen_home_ownership + gen_add_state_zip_ave_rate + gen_inq_last_6mths +
issue_d_date
,data = train.sub2)
summary(mod3)
#plot(mod3)
library(car)
vif(mod3)
mod4 <- lm(int_rate ~ loan_amnt + term + annual_inc + purpose + dti + delinq_2yrs + I(loan_amnt^2) +
open_acc + pub_rec + revol_bal + revol_util + total_acc + sqrt(annual_inc) +
initial_list_status + collections_12_mths_ex_med + application_type + acc_now_delinq +
tot_coll_amt + tot_cur_bal + total_rev_hi_lim + emp_length_sim +
gen_home_ownership + gen_add_state_zip_ave_rate + issue_d_date
,data = train.sub2)
summary(mod4)
#plot(mod3)
# split data into train and test for model performance
set.seed(1)
loan.complete = loan[complete.cases(loan), ]
selected_features = c("loan_amnt", "annual_inc", "dti", "delinq_2yrs", "open_acc",
"pub_rec", "revol_bal", "revol_util", "total_acc",
"tot_coll_amt", "tot_cur_bal", "total_rev_hi_lim", "gen_annual_inc_sqrt",
"gen_add_state_zip_ave_rate", "int_rate", "gen_loan_amnt_square")
# generate nonlinear features
loan.complete$gen_loan_amnt_square = loan.complete$loan_amnt^2
loan.complete$gen_annual_inc_sqrt = sqrt(loan.complete$annual_inc)
# generate dummy variables
dummy.variable = model.matrix(~purpose + term + gen_home_ownership +
collections_12_mths_ex_med +
acc_now_delinq + emp_length_sim - 1, loan.complete)
# numerical variable
num.variable = as.matrix(loan.complete[, selected_features])
# combine dummy with loan data
loan.sim = cbind(num.variable, dummy.variable)
train.ind <- sample(1:dim(loan.complete)[1], 0.7 * dim(loan.complete)[1])
train <- loan.sim[train.ind, ]
test <- loan.sim[-train.ind, ]
library(glmnet)
which(colnames(train)=='int_rate')
train.feature = train[, -15]
train.label = train[, 15]
test.feature = test[, -15]
test.label = test[, 15]
save(train.feature, file = "train_feature.Rdata")
save(train.label, file = "train_label.Rdata")
save(test.feature, file = "test_feature.Rdata")
save(test.label, file = "test_label.Rdata")
colnames(train.feature)
mod.glmnet <- glmnet(x=train.feature, y=train.label)
cvfit <- cv.glmnet(train.feature, train.label)
plot(cvfit)
# predict result with minimum mean cross-validated error
glm.pred = predict(cvfit, newx = test.feature, s = "lambda.min")
# cross validation result : root mean square error
mean((glm.pred - test.label)^2)^0.5